package com.suyunyou.spider.plugins.page;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.suyunyou.spider.model.Link;
import com.suyunyou.spider.model.PageDtl;
import com.suyunyou.spider.model.SpiderException;
import com.suyunyou.spider.plugins.IPagePlugin;
import com.suyunyou.spider.utils.PageDownUtil;
import com.system.comm.utils.FrameJsonUtil;

/**
 * Page plugin that extracts an article (title and body) from a downloaded
 * page using CSS selectors. Originally written as the plugin for crawling
 * Suyunyou.
 * @author yuejing
 * @date June 25, 2016 4:45:30 PM
 * @version V1.0.0
 */
public class PageDtlPlugin extends IPagePlugin {

	private static final Logger LOGGER = Logger.getLogger(PageDtlPlugin.class);
	/** CSS selector used to locate the title element(s). */
	private final String titleSelect;
	/** CSS selector used to locate the article body element. */
	private final String contentSelect;
	
	/**
	 * Creates a plugin for pages whose URL matches {@code regex}.
	 * @param regex          target URL regex, forwarded to {@code setTargetRegex}
	 * @param titleSelect    CSS selector for the article title
	 * @param contentSelect  CSS selector for the article body
	 */
	public PageDtlPlugin(String regex, String titleSelect, String contentSelect) {
		this.titleSelect = titleSelect;
		this.contentSelect = contentSelect;
		super.setTargetRegex(regex);
	}

	/**
	 * Parses the current link's content and builds the article detail.
	 * <p>The title is the combined text of every element matching the title
	 * selector; the content is the inner HTML of the first element matching
	 * the content selector.
	 * @return the extracted detail, or {@code null} when there is no page
	 *         content to parse or the content selector matches nothing
	 * @throws SpiderException declared by the overridden method; not thrown
	 *         directly by this implementation
	 */
	@Override
	public PageDtl setPageDtl() throws SpiderException {
		Link page = getLink();
		// Guard against a missing link or a failed download: handing null to
		// Jsoup.parse would fail with an exception instead of a clean "not found".
		if (page == null || page.getContent() == null) {
			LOGGER.warn("No page content available, skip article extraction");
			return null;
		}
		Document doc = Jsoup.parse(page.getContent());
		Elements bodies = doc.select(contentSelect);
		if (bodies.isEmpty()) {
			// No article body found
			return null;
		}
		PageDtl dtl = new PageDtl();
		dtl.setTitle(doc.select(titleSelect).text());
		dtl.setContent(bodies.get(0).html());
		LOGGER.info("找到文章啦 ~ 我要写入数据库中。哈哈 [" + page.getLink() + "]");
		return dtl;
	}

	/**
	 * Manual smoke test: checks the sample URL against the target regex,
	 * fetches the page through {@code PageDownUtil.get}, runs the plugin on
	 * it and prints the result as JSON.
	 * @param args unused
	 * @throws SpiderException if the plugin reports a failure
	 */
	public static void main(String[] args) throws SpiderException {
		/* Alternative sample:
		String link = "http://www.blogjava.net/sealyu/archive/2009/12/24/307165.html";
		String regex = "http://www\\.blogjava\\.net/\\w+/\\w+/\\d+/\\d+/\\d+/\\d+\\.html";*/
		String link = "http://www.runoob.com/java/java-environment-setup.html";
		// Inside a character class '|' is a literal pipe, not alternation, and
		// \w already covers digits; the dots are escaped so they only match '.'.
		String regex = "http://www\\.runoob\\.com/java/[\\w-]+\\.html";
		
		Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
		Matcher matcher = p.matcher(link);
		while (matcher.find()) {
			System.out.println(matcher.group());
		}

		String titleSelect = "#content>h1";
		String contentSelect = "#content";
		PageDtlPlugin plugin = new PageDtlPlugin(regex, titleSelect, contentSelect);
		Link l = new Link();
		l.setLink(link);
		l.setContent(PageDownUtil.get(link));
		plugin.setLink(l);
		PageDtl page = plugin.setPageDtl();
		System.out.println(FrameJsonUtil.toString(page));
	}
}